library(DataExplorer)
library(skimr)
library(LearnEDA)
## Loading required package: aplpack
## Loading required package: vcd
## Loading required package: grid
library(LearnEDAfunctions)
## Loading required package: dplyr
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
## Loading required package: ggplot2
##
## Attaching package: 'LearnEDAfunctions'
## The following objects are masked from 'package:LearnEDA':
##
## fit.gaussian, half.slope.ratio, han, hinkley, lval, mtrans,
## plot2way, power.t, rline, symplot
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ lubridate 1.9.2 ✔ tibble 3.2.1
## ✔ purrr 1.0.2 ✔ tidyr 1.3.0
## ✔ readr 2.1.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(LearnBayes)
library(Hmisc)
##
## Attaching package: 'Hmisc'
##
## The following objects are masked from 'package:dplyr':
##
## src, summarize
##
## The following objects are masked from 'package:base':
##
## format.pval, units
A data frame with 322 observations of major league players on the following 20 variables.
| Variable | Description |
|---|---|
| AtBat | Number of times at bat in 1986 |
| Hits | Number of hits in 1986 |
| HmRun | Number of home runs in 1986 |
| Runs | Number of runs in 1986 |
| RBI | Number of runs batted in in 1986 |
| Walks | Number of walks in 1986 |
| Years | Number of years in the major leagues |
| CAtBat | Number of times at bat during his career |
| CHits | Number of hits during his career |
| CHmRun | Number of home runs during his career |
| CRuns | Number of runs during his career |
| CRBI | Number of runs batted in during his career |
| CWalks | Number of walks during his career |
| League | A factor with levels A and N indicating player’s league at the end of 1986 |
| Division | A factor with levels E and W indicating player’s division at the end of 1986 |
| PutOuts | Number of put outs in 1986 |
| Assists | Number of assists in 1986 |
| Errors | Number of errors in 1986 |
| Salary | 1987 annual salary on opening day in thousands of dollars |
| NewLeague | A factor with levels A and N indicating player’s league at the beginning of 1987 |
df0 = read.csv("/Users/shegeb/Documents/MY PROJECTS/exploratory_data_analysis/Hitters.csv")
head(df0,1)
## AtBat Hits HmRun Runs RBI Walks Years CAtBat CHits CHmRun CRuns CRBI CWalks
## 1 293 66 1 30 29 14 1 293 66 1 30 29 14
## League Division PutOuts Assists Errors Salary NewLeague
## 1 A E 446 33 20 NA A
str(df0)
## 'data.frame': 322 obs. of 20 variables:
## $ AtBat : int 293 315 479 496 321 594 185 298 323 401 ...
## $ Hits : int 66 81 130 141 87 169 37 73 81 92 ...
## $ HmRun : int 1 7 18 20 10 4 1 0 6 17 ...
## $ Runs : int 30 24 66 65 39 74 23 24 26 49 ...
## $ RBI : int 29 38 72 78 42 51 8 24 32 66 ...
## $ Walks : int 14 39 76 37 30 35 21 7 8 65 ...
## $ Years : int 1 14 3 11 2 11 2 3 2 13 ...
## $ CAtBat : int 293 3449 1624 5628 396 4408 214 509 341 5206 ...
## $ CHits : int 66 835 457 1575 101 1133 42 108 86 1332 ...
## $ CHmRun : int 1 69 63 225 12 19 1 0 6 253 ...
## $ CRuns : int 30 321 224 828 48 501 30 41 32 784 ...
## $ CRBI : int 29 414 266 838 46 336 9 37 34 890 ...
## $ CWalks : int 14 375 263 354 33 194 24 12 8 866 ...
## $ League : chr "A" "N" "A" "N" ...
## $ Division : chr "E" "W" "W" "E" ...
## $ PutOuts : int 446 632 880 200 805 282 76 121 143 0 ...
## $ Assists : int 33 43 82 11 40 421 127 283 290 0 ...
## $ Errors : int 20 10 14 3 4 25 7 9 19 0 ...
## $ Salary : num NA 475 480 500 91.5 750 70 100 75 1100 ...
## $ NewLeague: chr "A" "N" "A" "N" ...
dim(df0)
## [1] 322 20
skim(df0)
| Name | df0 |
| Number of rows | 322 |
| Number of columns | 20 |
| _______________________ | |
| Column type frequency: | |
| character | 3 |
| numeric | 17 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| League | 0 | 1 | 1 | 1 | 0 | 2 | 0 |
| Division | 0 | 1 | 1 | 1 | 0 | 2 | 0 |
| NewLeague | 0 | 1 | 1 | 1 | 0 | 2 | 0 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| AtBat | 0 | 1.00 | 380.93 | 153.40 | 16.0 | 255.25 | 379.5 | 512.00 | 687 | ▁▇▇▇▅ |
| Hits | 0 | 1.00 | 101.02 | 46.45 | 1.0 | 64.00 | 96.0 | 137.00 | 238 | ▃▇▆▃▁ |
| HmRun | 0 | 1.00 | 10.77 | 8.71 | 0.0 | 4.00 | 8.0 | 16.00 | 40 | ▇▃▂▁▁ |
| Runs | 0 | 1.00 | 50.91 | 26.02 | 0.0 | 30.25 | 48.0 | 69.00 | 130 | ▅▇▆▃▁ |
| RBI | 0 | 1.00 | 48.03 | 26.17 | 0.0 | 28.00 | 44.0 | 64.75 | 121 | ▃▇▃▃▁ |
| Walks | 0 | 1.00 | 38.74 | 21.64 | 0.0 | 22.00 | 35.0 | 53.00 | 105 | ▅▇▅▂▁ |
| Years | 0 | 1.00 | 7.44 | 4.93 | 1.0 | 4.00 | 6.0 | 11.00 | 24 | ▇▆▃▂▁ |
| CAtBat | 0 | 1.00 | 2648.68 | 2324.21 | 19.0 | 816.75 | 1928.0 | 3924.25 | 14053 | ▇▃▂▁▁ |
| CHits | 0 | 1.00 | 717.57 | 654.47 | 4.0 | 209.00 | 508.0 | 1059.25 | 4256 | ▇▃▁▁▁ |
| CHmRun | 0 | 1.00 | 69.49 | 86.27 | 0.0 | 14.00 | 37.5 | 90.00 | 548 | ▇▁▁▁▁ |
| CRuns | 0 | 1.00 | 358.80 | 334.11 | 1.0 | 100.25 | 247.0 | 526.25 | 2165 | ▇▂▁▁▁ |
| CRBI | 0 | 1.00 | 330.12 | 333.22 | 0.0 | 88.75 | 220.5 | 426.25 | 1659 | ▇▂▁▁▁ |
| CWalks | 0 | 1.00 | 260.24 | 267.06 | 0.0 | 67.25 | 170.5 | 339.25 | 1566 | ▇▂▁▁▁ |
| PutOuts | 0 | 1.00 | 288.94 | 280.70 | 0.0 | 109.25 | 212.0 | 325.00 | 1378 | ▇▃▁▁▁ |
| Assists | 0 | 1.00 | 106.91 | 136.85 | 0.0 | 7.00 | 39.5 | 166.00 | 492 | ▇▂▁▁▁ |
| Errors | 0 | 1.00 | 8.04 | 6.37 | 0.0 | 3.00 | 6.0 | 11.00 | 32 | ▇▅▂▁▁ |
| Salary | 59 | 0.82 | 535.93 | 451.12 | 67.5 | 190.00 | 425.0 | 750.00 | 2460 | ▇▃▁▁▁ |
baseball = drop_na(df0)
dim(baseball)
## [1] 263 20
hist(baseball$Salary)
This is right/positively skewed
stem.leaf(baseball$Salary)
## 1 | 2: represents 120
## leaf unit: 10
## n: 263
## 24 0 | 667777777777888999999999
## 69 1 | 000000001111122233334444455555666677777889999
## 102 2 | 000001111122334444555555667777789
## 124 3 | 0000002222444555666788
## (23) 4 | 00001122223355555777889
## 116 5 | 0000112233355567889
## 97 6 | 00001223455677
## 83 7 | 0000002333444555555556777788
## 55 8 | 001255557777
## 43 9 | 0002234556
## 33 10 | 0000445
## 26 11 | 0578
## 22 12 | 0236
## 18 13 | 00015
## 13 14 | 5
## 12 15 | 0
## HI: 1600 1670 1800 1861.46 1900 1925.571 1940 1975 2127.333 2412.5 2460
class(baseball)
## [1] "data.frame"
# test = subset(baseball, select = -c(League, Division, NewLeague, Salary) )
# test
#heatmap(baseball)
# heatmap(test, Rowv = NA, Colv = NA)
Why we reexpress data?
baseball$Salary = ifelse( ( 0<baseball$Salary & baseball$Salary<=400 ), 1, baseball$Salary)
baseball$Salary = ifelse( ( 400<baseball$Salary & baseball$Salary<=800 ), 2, baseball$Salary)
baseball$Salary = ifelse( ( 800<baseball$Salary & baseball$Salary<=1200 ), 3, baseball$Salary)
baseball$Salary = ifelse( ( 1200<baseball$Salary & baseball$Salary<=1600 ), 4, baseball$Salary)
baseball$Salary = ifelse( ( 1600<baseball$Salary & baseball$Salary<=2000 ), 5, baseball$Salary)
baseball$Salary = ifelse( ( 2000<baseball$Salary & baseball$Salary<=2500 ), 6, baseball$Salary)
baseball$Salary = as.factor(baseball$Salary)
skim(baseball)
| Name | baseball |
| Number of rows | 263 |
| Number of columns | 20 |
| _______________________ | |
| Column type frequency: | |
| character | 3 |
| factor | 1 |
| numeric | 16 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| League | 0 | 1 | 1 | 1 | 0 | 2 | 0 |
| Division | 0 | 1 | 1 | 1 | 0 | 2 | 0 |
| NewLeague | 0 | 1 | 1 | 1 | 0 | 2 | 0 |
Variable type: factor
| skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
|---|---|---|---|---|---|
| Salary | 0 | 1 | FALSE | 6 | 1: 128, 2: 82, 3: 32, 4: 11 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| AtBat | 0 | 1 | 403.64 | 147.31 | 19 | 282.5 | 413 | 526.0 | 687 | ▁▇▇▇▆ |
| Hits | 0 | 1 | 107.83 | 45.13 | 1 | 71.5 | 103 | 141.5 | 238 | ▂▇▇▅▁ |
| HmRun | 0 | 1 | 11.62 | 8.76 | 0 | 5.0 | 9 | 18.0 | 40 | ▇▅▃▂▁ |
| Runs | 0 | 1 | 54.75 | 25.54 | 0 | 33.5 | 52 | 73.0 | 130 | ▃▇▇▃▁ |
| RBI | 0 | 1 | 51.49 | 25.88 | 0 | 30.0 | 47 | 71.0 | 121 | ▂▇▅▃▁ |
| Walks | 0 | 1 | 41.11 | 21.72 | 0 | 23.0 | 37 | 57.0 | 105 | ▅▇▅▃▁ |
| Years | 0 | 1 | 7.31 | 4.79 | 1 | 4.0 | 6 | 10.0 | 24 | ▇▆▃▂▁ |
| CAtBat | 0 | 1 | 2657.54 | 2286.58 | 19 | 842.5 | 1931 | 3890.5 | 14053 | ▇▃▁▁▁ |
| CHits | 0 | 1 | 722.19 | 648.20 | 4 | 212.0 | 516 | 1054.0 | 4256 | ▇▃▁▁▁ |
| CHmRun | 0 | 1 | 69.24 | 82.20 | 0 | 15.0 | 40 | 92.5 | 548 | ▇▁▁▁▁ |
| CRuns | 0 | 1 | 361.22 | 331.20 | 2 | 105.5 | 250 | 497.5 | 2165 | ▇▂▁▁▁ |
| CRBI | 0 | 1 | 330.42 | 323.37 | 3 | 95.0 | 230 | 424.5 | 1659 | ▇▃▁▁▁ |
| CWalks | 0 | 1 | 260.27 | 264.06 | 1 | 71.0 | 174 | 328.5 | 1566 | ▇▂▁▁▁ |
| PutOuts | 0 | 1 | 290.71 | 279.93 | 0 | 113.5 | 224 | 322.5 | 1377 | ▇▃▁▁▁ |
| Assists | 0 | 1 | 118.76 | 145.08 | 0 | 8.0 | 45 | 192.0 | 492 | ▇▂▂▁▁ |
| Errors | 0 | 1 | 8.59 | 6.61 | 0 | 3.0 | 7 | 13.0 | 32 | ▇▅▃▁▁ |
# Steam plot is not suitable for this dataset because this dataset has more than 300 observations.
# stem.leaf(baseball$AtBat, unit= 10, m=1, depth=TRUE)
# stem.leaf(baseball$Hits, unit= 1, m=1, depth=TRUE)
#
df2 = baseball[, c("AtBat", "Hits")]
pairs(df2)
with(df2,
plot(df2$AtBat,
df2$Salary,
xlab="Salary",
ylab="AtBat")
)
with(df2,
plot(df2$Hits,
df2$Salary,
xlab="Salary",
ylab="Hits")
)
hist(baseball$AtBat)
hist(baseball$Hits)
library(Hmisc)
df_1 = baseball[ , c("AtBat", "Hits")]
hist.data.frame(df_1)
hist(baseball$AtBat,
freq=FALSE
)
lines(density(baseball$AtBat, bw=1.5),
lwd=2
)
hist(baseball$Hits,
freq=FALSE
)
lines(density(baseball$Hits, bw=1.5),
lwd=2
)
symplot(baseball$AtBat)
symplot(baseball$Hits)
AtBat, and
Hits.AtBat: slightly left skewed, since most of the points
are below the line (u=v)Hits: right skewness, since most of the points are
above the line (u=v)d2 <- lval_plus(baseball, baseball$Hits)
filter(d2, OUT == TRUE)
## [1] AtBat Hits HmRun Runs RBI Walks Years
## [8] CAtBat CHits CHmRun CRuns CRBI CWalks League
## [15] Division PutOuts Assists Errors Salary NewLeague Fence_LO
## [22] Fence_HI OUT
## <0 rows> (or 0-length row.names)
d3 <- lval_plus(baseball, baseball$AtBat)
filter(d3, OUT == TRUE)
## [1] AtBat Hits HmRun Runs RBI Walks Years
## [8] CAtBat CHits CHmRun CRuns CRBI CWalks League
## [15] Division PutOuts Assists Errors Salary NewLeague Fence_LO
## [22] Fence_HI OUT
## <0 rows> (or 0-length row.names)
baseball = baseball[which(baseball$AtBat>0),]
intersect(d2,d3)
## [1] AtBat Hits HmRun Runs RBI Walks Years
## [8] CAtBat CHits CHmRun CRuns CRBI CWalks League
## [15] Division PutOuts Assists Errors Salary NewLeague Fence_LO
## [22] Fence_HI OUT
## <0 rows> (or 0-length row.names)
There are several reasons we may want to re- express our data:
boxplot(AtBat~ Salary, horizontal = TRUE,
data = baseball,
xlab = "AtBat",
ylab = "Salary", show.names=TRUE)
boxplot(Hits~ Salary, horizontal = TRUE,
data = baseball,
xlab = "Hits",
ylab = "Salary", show.names=TRUE)
Comments
spread.level.plot(baseball$AtBat,
baseball$Salary)
## GROUP medians dfs log.median log.df
## 1 2 319.0 225.5 2.503791 2.353147
## 2 1 450.0 201.0 2.653213 2.303196
## 3 3 512.5 136.0 2.709694 2.133539
## 4 4 550.0 143.5 2.740363 2.156852
## 5 5 565.0 78.5 2.752048 1.894870
## 6 6 495.0 299.0 2.694605 2.475671
spread.level.plot(baseball$Hits,
baseball$Salary)
## GROUP medians dfs log.median log.df
## 1 2 79.5 56.0 1.900367 1.748188
## 2 1 117.5 71.0 2.070038 1.851258
## 3 3 137.0 43.0 2.136721 1.633468
## 4 4 142.0 50.5 2.152288 1.703291
## 5 5 160.0 21.0 2.204120 1.322219
## 6 6 151.0 99.5 2.178977 1.997823
Comments
AtBat:
- Clearly there is a negative association in the graph, indicating that
batches with small medians tend to have large dfs (spreads).
Hits:
- Also there is a week negative association, indicating that batches
with small medians tend to also have small dfs (spreads). That means
that there is a strong dependence between spread and level.
AtBat: slop = -1.8
The slope of the line (as shown in the plot) is b = -1.8. So the power
of the reexpression is p = 1 − b = 1 + 1.8 = 2.8 which is approximatly
3
Hits: slop = -0.68
The slope of the line is b = -0.68. So the power of the reexpression is
p = 1 − b = 1 + 0.68 = 1.68 which is approximately 2
df_reexpress = baseball
df_reexpress$AtBat = (df_reexpress$AtBat)^(3)
df_reexpress$Hits = (df_reexpress$Hits)^(2)
boxplot(AtBat~ Salary, horizontal = TRUE,
data = df_reexpress,
xlab = "AtBat^3",
ylab = "Salary", show.names=TRUE)
boxplot(AtBat~ Salary, horizontal = TRUE,
data = baseball,
xlab = "AtBat",
ylab = "Salary", show.names=TRUE)
boxplot(Hits~ Salary, horizontal = TRUE,
data = df_reexpress,
xlab = "Hits^(2)",
ylab = "Salary", show.names=TRUE)
boxplot(Hits~ Salary, horizontal = TRUE,
data = baseball,
xlab = "Hits",
ylab = "Salary", show.names=TRUE)
Re-expression for AtBat^2
- There is a slight improvement in the box plot and as we can see that
still all boxes do not have same speard.
Re-expression for Hits^(-2.5)
- By this re-expression the spread of box plots did not change
significantly. So, this re-expression does not work well for this
batch.
with(df_reexpress,
spread.level.plot(AtBat, Salary))
## GROUP medians dfs log.median log.df
## 1 2 32465587 76214818 7.511423 7.882039
## 2 1 91211400 113265711 7.960049 8.054098
## 3 3 134611712 106968280 8.129083 8.029255
## 4 4 166375000 113958114 8.221088 8.056745
## 5 5 180362125 76575180 8.256145 7.884088
## 6 6 121287375 118010516 8.083816 8.071921
with(df_reexpress,
spread.level.plot(Hits, Salary))
## GROUP medians dfs log.median log.df
## 1 2 6322.5 9464.0 3.800889 3.976075
## 2 1 13808.5 16827.0 4.140147 4.226007
## 3 3 18769.0 11494.0 4.273441 4.060471
## 4 4 20164.0 14985.5 4.304577 4.175671
## 5 5 25600.0 6585.0 4.408240 3.818556
## 6 6 22801.0 19999.5 4.357954 4.301019
a = with(df_reexpress,
spread.level.plot(AtBat, Salary))
round( a$dfs/a$medians, 3)
## [1] 2.348 1.242 0.795 0.685 0.425 0.973
a = with(df_reexpress,
spread.level.plot(Hits, Salary))
round( a$dfs/a$medians, 3)
## [1] 1.497 1.219 0.612 0.743 0.257 0.877
AtBat:
We have a significant improvement in the spread plot. The slop of
spread-level plot is -0.41 which is approximately zero. We removed the
association between spread and level.
If we divide the fourth-spreads by the corresponding medians, we
get the values:
2.348 1.242 0.795 0.685 0.425 0.973
These aren’t really that small (it should be 0.2 or smaller), explaining that the spread vs level doesn’t work very well.
Hits :
For ResringBP, also the spread-level plot shows that there
is not a association between spread and level. The value of slop of
spread-level plot reduced to -0.39 which is approximately zero.
However, by looking at the boxplots display of the re-expressed data – we do see some differences in spreads between the batches. I see a slight improvement using this re-expression for this batch.
If we divide the fourth-spreads by the corresponding medians, we
get the values:
1.497 1.219 0.612 0.743 0.257 0.877
These aren’t really that small. This explains why the spread vs level doesn’t work very well for this batch
AtBatbaseball = drop_na(df0)
baseball = baseball[which(baseball$AtBat>0),]
#aplpack::stem.leaf(baseball$AtBat, unit=10, m=1, trim.outliers=FALSE)
hist(baseball$AtBat)
letter.values <- lval(baseball$AtBat)
letter.values$mids
## [1] 413.00 404.25 399.00 397.50 393.00 384.50 401.50 350.00 353.00
plot(letter.values$mids)
symplot(baseball$AtBat)
Comments
According to the histogram chart and letter values we can see left skewness in the batch.
# P = 0.5
roots_AtBat <- sqrt(baseball$AtBat)
#aplpack::stem.leaf(roots_AtBat)
root.lv <- lval(roots_AtBat)
root.lv
## depth lo hi mids spreads
## M 132.0 20.322401 20.32240 20.32240 0.000000
## H 66.5 16.807730 22.93469 19.87121 6.126960
## E 33.5 14.611629 24.17643 19.39403 9.564803
## D 17.0 13.601471 24.69818 19.14982 11.096708
## C 9.0 12.449900 25.11971 18.78481 12.669814
## B 5.0 11.269428 25.33772 18.30357 14.068291
## A 3.0 11.224972 26.01922 18.62210 14.794252
## Z 2.0 4.472136 26.07681 15.27447 21.604674
## Y 1.0 4.358899 26.21068 15.28479 21.851786
plot(root.lv$mids)
log_AtBat <- log(baseball$AtBat)
#aplpack::stem.leaf(log_AtBat)
logs.lv <- lval(log_AtBat)
logs.lv
## depth lo hi mids spreads
## M 132.0 6.023448 6.023448 6.023448 0.0000000
## H 66.5 5.643677 6.265301 5.954489 0.6216242
## E 33.5 5.363634 6.370756 5.867195 1.0071223
## D 17.0 5.220356 6.413459 5.816907 1.1931031
## C 9.0 5.043425 6.447306 5.745365 1.4038807
## B 5.0 4.844187 6.464588 5.654388 1.6204012
## A 3.0 4.836282 6.517671 5.676977 1.6813894
## Z 2.0 2.995732 6.522093 4.758913 3.5263605
## Y 1.0 2.944439 6.532334 4.738387 3.5878953
plot(logs.lv$mids)
squared_AtBat <- (baseball$AtBat)^(2)
#aplpack::stem.leaf(logs_AtBat)
logs.lv <- lval(squared_AtBat)
logs.lv
## depth lo hi mids spreads
## M 132.0 170569.0 170569.0 170569.0 0.0
## H 66.5 79806.5 276676.0 178241.2 196869.5
## E 33.5 45582.5 341640.5 193611.5 296058.0
## D 17.0 34225.0 372100.0 203162.5 337875.0
## C 9.0 24025.0 398161.0 211093.0 374136.0
## B 5.0 16129.0 412164.0 214146.5 396035.0
## A 3.0 15876.0 458329.0 237102.5 442453.0
## Z 2.0 400.0 462400.0 231400.0 462000.0
## Y 1.0 361.0 471969.0 236165.0 471608.0
plot(logs.lv$mids)
symplot(roots_AtBat)
symplot(log_AtBat)
symplot(squared_AtBat)
hinkley(roots_AtBat)
## h
## -0.1016702
hinkley(log_AtBat)
## h
## -0.179135
hinkley(squared_AtBat)
## h
## 0.07098735
Hitshist(baseball$Hits)
letter.values <- lval(baseball$Hits)
letter.values$mids
## [1] 103.0 106.5 108.5 108.5 118.5 121.0 120.0 113.5 119.5
plot(letter.values$mids)
symplot(baseball$Hits)
Comments
roots_Hits <- sqrt(baseball$Hits)
#aplpack::stem.leaf(roots_Hits)
root.lv <- lval(roots_Hits)
root.lv
## depth lo hi mids spreads
## M 132.0 10.148892 10.14889 10.148892 0.000000
## H 66.5 8.455716 11.89536 10.175537 3.439643
## E 33.5 7.348469 12.76715 10.057807 5.418676
## D 17.0 6.557439 13.19091 9.874172 6.633467
## C 9.0 6.244998 14.07125 10.158123 7.826249
## B 5.0 5.656854 14.49138 10.074115 8.834522
## A 3.0 5.196152 14.59452 9.895336 9.398367
## Z 2.0 2.000000 14.93318 8.466592 12.933185
## Y 1.0 1.000000 15.42725 8.213624 14.427249
plot(root.lv$mids)
recroot_Hits <- 1/sqrt(baseball$Hits)
#aplpack::stem.leaf(recroot_Hits)
logs.lv <- lval(recroot_Hits)
logs.lv
## depth lo hi mids spreads
## M 132.0 0.09853293 0.09853293 0.09853293 0.00000000
## H 66.5 0.08406666 0.11826465 0.10116566 0.03419798
## E 33.5 0.07832604 0.13608276 0.10720440 0.05775672
## D 17.0 0.07580980 0.15249857 0.11415419 0.07668877
## C 9.0 0.07106691 0.16012815 0.11559753 0.08906125
## B 5.0 0.06900656 0.17677670 0.12289163 0.10777014
## A 3.0 0.06851887 0.19245009 0.13048448 0.12393122
## Z 2.0 0.06696495 0.50000000 0.28348248 0.43303505
## Y 1.0 0.06482037 1.00000000 0.53241019 0.93517963
plot(logs.lv$mids)
log_Hits <- log(baseball$Hits)
#aplpack::stem.leaf(logs_AtBat)
logs.lv <- lval(log_Hits)
logs.lv
## depth lo hi mids spreads
## M 132.0 4.634729 4.634729 4.634729 0.0000000
## H 66.5 4.269673 4.952293 4.610983 0.6826205
## E 33.5 3.988984 5.093750 4.541367 1.1047662
## D 17.0 3.761200 5.159055 4.460128 1.3978552
## C 9.0 3.663562 5.288267 4.475914 1.6247054
## B 5.0 3.465736 5.347108 4.406422 1.8813716
## A 3.0 3.295837 5.361292 4.328565 2.0654553
## Z 2.0 1.386294 5.407172 3.396733 4.0208774
## Y 1.0 0.000000 5.472271 2.736135 5.4722707
plot(logs.lv$mids)
symplot(roots_Hits)
symplot(recroot_Hits)
symplot(log_Hits)
hinkley(roots_Hits)
## h
## -0.006195716
hinkley(recroot_Hits)
## h
## 0.2790026
hinkley(log_Hits)
## h
## -0.1015827
mtrans functionraw_Chol <- baseball$AtBat
matched.roots <- mtrans(raw_Chol, 0.5)
matched.square <- mtrans(raw_Chol, 2)
matched.logs <- mtrans(raw_Chol, 0)
boxplot(data.frame(raw_Chol, matched.roots, matched.logs, matched.square))
raw_Rest <- baseball$Hits
matched.roots <- mtrans(raw_Rest, 0.5)
matched.recroots <- mtrans(raw_Rest, -0.5)
matched.logs <- mtrans(raw_Rest, 0)
boxplot(data.frame(raw_Rest, matched.roots, matched.logs, matched.recroots))
Conclusion
According to the box plots, for AtBat batch the best
transformation is power transformation with p=2, and for
Hits batch the best one is root transformation (0.5). In
this way right and left skewness have been removed for these
batches.
Salary
vs. AtBatdf_1 = drop_na(df0)
df_1 = df_1[which(df_1$AtBat>0),]
dat1 = df_1[ , c("AtBat", "Hits", "Salary")]
dat1 <- data.frame(dat1)
df <- dat1[order(dat1$Salary), ]
nn = floor(nrow(df)/3)
baseball = df[1:nn,]
df2 = df[(nn+1):(2*nn), ]
df3 = df[((2*nn)+1):nrow(df) , ]
with(dat1,plot(dat1$Salary ,dat1$AtBat,
xlab= "Salary ",
ylab= "AtBat",
main= "Salary vs. AtBat"))
summary.points <- data.frame(x = c(median(baseball$Salary ), median(df2$Salary ),median(df3$Salary )),
y = c(median(baseball$AtBat), median(df2$AtBat), median(df3$AtBat)))
points(summary.points, cex=2, pch=19, col="red")
myfit <- rline(AtBat ~ Salary, dat1)
with(dat1,plot(dat1$Salary ,dat1$AtBat,
xlab= "Salary ",
ylab= "AtBat",
main= "Salary vs. AtBat"))
summary.points <- data.frame(x = c(median(baseball$Salary ), median(df2$Salary ),median(df3$Salary )),
y = c(median(baseball$AtBat), median(df2$AtBat), median(df3$AtBat)))
points(summary.points, cex=2, pch=19, col="red")
curve(myfit$b * (x - myfit$xC) + myfit$a,
add=TRUE, col="red")
plot(dat1$Salary, myfit$residual,
xlab = "Salary",
ylab = "Residual",
main = "Residual VS. Salary ")
abline(h=0, lwd=2)
The slope is m = 0.2552614 , so this means that the level of AtBat in people has generally been increasing at a rate of 0.2552614 per Salary. That’s a pretty a good explenation how Salary can influence the level of AtBat in a person. By increasing the Salary by one year the level of AtBat increases by 0.2552614.
Salary
vs. Hitswith(dat1,plot(dat1$Salary ,dat1$Hits,
xlab= "Salary ",
ylab= "Hits",
main= "Salary vs. Hits"))
summary.points <- data.frame(x = c(median(baseball$Salary ), median(df2$Salary ),median(df3$Salary )),
y = c(median(baseball$Hits), median(df2$Hits), median(df3$Hits)))
points(summary.points, cex=2, pch=19, col="red")
myfit <- rline(Hits ~ Salary, dat1)
with(dat1,plot(dat1$Salary ,dat1$Hits,
xlab= "Salary ",
ylab= "Hits",
main= "Salary vs. Hits"))
summary.points <- data.frame(x = c(median(baseball$Salary ), median(df2$Salary ),median(df3$Salary )),
y = c(median(baseball$Hits), median(df2$Hits), median(df3$Hits)))
points(summary.points, cex=2, pch=19, col="red")
curve(myfit$b * (x - myfit$xC) + myfit$a,
add=TRUE, col="red")
plot(dat1$Salary, myfit$residual,
xlab = "Salary",
ylab = "Residual",
main = "Residual VS. Salary ")
abline(h=0, lwd=2)
According to the resistant fit the slope is
m = 0.0712831 , so this means that the level of
Hits in people has generally been increasing at a rate of
m = 0.0712831 per Salary, explaining how
Salary can influence the level of Hits in a
person.
AtBatbaseball = drop_na(df0)
baseball = baseball[ , c("AtBat", "Hits")]
baseball = baseball[which(baseball$AtBat>0),]
bins <- seq(min(baseball$AtBat, na.rm = TRUE), max(baseball$AtBat, na.rm = TRUE), length.out = 50)
bin.mids <- (bins[-1] + bins[-length(bins)]) / 2
with(baseball,
hist(AtBat, breaks = bins, xlab = "Height", main = ""))
a = fivenum(baseball$AtBat)
b = lval(baseball$AtBat)
FL = a[2] # lower fourth
FU = a[4] # upper fourth
m = (FL + FU) / 2 # m = b[2,4]
cat("The matching mean: m=", m)
## The matching mean: m= 404.25
st = (FU - FL) / 1.349 # s = the Gaussian standard deviation
cat("The Gaussian standard deviation: st =", st)
## The Gaussian standard deviation: st = 180.5041
# So our matching Gaussian curve is N(m, st).
s <- fit.gaussian(baseball$AtBat, bins, m, st)
options(digits=3)
df = data.frame(Mid=bin.mids, d=s$counts, sqrt.d=sqrt(s$counts),
Prob=s$probs, e=s$expected, sqrt.e=sqrt(s$expected),
Residual=s$residual, DDR = ((sqrt(2+4*s$counts) - sqrt(1+4*s$expected))))
h <- with(baseball,
hist(AtBat, breaks = bins, xlab = "AtBat", main = ""))
data.frame(Mid=h$mids, Count=h$counts, Roots=sqrt(h$counts))
## Mid Count Roots
## 1 25.8 2 1.41
## 2 39.4 0 0.00
## 3 53.1 0 0.00
## 4 66.7 0 0.00
## 5 80.3 0 0.00
## 6 94.0 0 0.00
## 7 107.6 0 0.00
## 8 121.2 3 1.73
## 9 134.9 0 0.00
## 10 148.5 4 2.00
## 11 162.1 2 1.41
## 12 175.8 3 1.73
## 13 189.4 6 2.45
## 14 203.0 10 3.16
## 15 216.7 11 3.32
## 16 230.3 5 2.24
## 17 243.9 3 1.73
## 18 257.6 6 2.45
## 19 271.2 6 2.45
## 20 284.8 11 3.32
## 21 298.5 3 1.73
## 22 312.1 12 3.46
## 23 325.7 10 3.16
## 24 339.4 7 2.65
## 25 353.0 5 2.24
## 26 366.6 4 2.00
## 27 380.3 8 2.83
## 28 393.9 3 1.73
## 29 407.5 8 2.83
## 30 421.2 9 3.00
## 31 434.8 8 2.83
## 32 448.4 4 2.00
## 33 462.1 6 2.45
## 34 475.7 9 3.00
## 35 489.3 11 3.32
## 36 503.0 5 2.24
## 37 516.6 12 3.46
## 38 530.2 7 2.65
## 39 543.9 6 2.45
## 40 557.5 5 2.24
## 41 571.1 9 3.00
## 42 584.8 15 3.87
## 43 598.4 7 2.65
## 44 612.0 5 2.24
## 45 625.7 5 2.24
## 46 639.3 4 2.00
## 47 652.9 0 0.00
## 48 666.6 1 1.00
## 49 680.2 3 1.73
h$counts <- sqrt(h$counts)
plot(h, xlab = "AtBat", ylab = "ROOT FREQUENCY", main = "")
lines(bin.mids, sqrt(s$expected))
h <- with(baseball,
hist(AtBat, breaks = bins, xlab = "AtBat", main = ""))
data.frame(Mid=h$mids, Count=h$counts, Roots=sqrt(h$counts))
## Mid Count Roots
## 1 25.8 2 1.41
## 2 39.4 0 0.00
## 3 53.1 0 0.00
## 4 66.7 0 0.00
## 5 80.3 0 0.00
## 6 94.0 0 0.00
## 7 107.6 0 0.00
## 8 121.2 3 1.73
## 9 134.9 0 0.00
## 10 148.5 4 2.00
## 11 162.1 2 1.41
## 12 175.8 3 1.73
## 13 189.4 6 2.45
## 14 203.0 10 3.16
## 15 216.7 11 3.32
## 16 230.3 5 2.24
## 17 243.9 3 1.73
## 18 257.6 6 2.45
## 19 271.2 6 2.45
## 20 284.8 11 3.32
## 21 298.5 3 1.73
## 22 312.1 12 3.46
## 23 325.7 10 3.16
## 24 339.4 7 2.65
## 25 353.0 5 2.24
## 26 366.6 4 2.00
## 27 380.3 8 2.83
## 28 393.9 3 1.73
## 29 407.5 8 2.83
## 30 421.2 9 3.00
## 31 434.8 8 2.83
## 32 448.4 4 2.00
## 33 462.1 6 2.45
## 34 475.7 9 3.00
## 35 489.3 11 3.32
## 36 503.0 5 2.24
## 37 516.6 12 3.46
## 38 530.2 7 2.65
## 39 543.9 6 2.45
## 40 557.5 5 2.24
## 41 571.1 9 3.00
## 42 584.8 15 3.87
## 43 598.4 7 2.65
## 44 612.0 5 2.24
## 45 625.7 5 2.24
## 46 639.3 4 2.00
## 47 652.9 0 0.00
## 48 666.6 1 1.00
## 49 680.2 3 1.73
h$counts <- sqrt(h$counts)
plot(h, xlab = "TIME", ylab = "ROOT FREQUENCY", main = "")
rootogram(s$counts, s$expected)
rootogram(s$counts, s$expected, type="deviation")
The red dots correspond to the square root of the fitted counts (from the Gaussian distribution). We “hang” the actual bin counts (the bars) on the fits. Any bars that don’t land on the value 0 correspond to some deviations from normality.
First thing I notice is large negative and then positive
deviations at the high end. This means that we see more large AtBat
levels than we’d predict from the Gaussian curve.
Also, I notice positive deviations at the left side of the plot.
This means there are fewer small AtBat levels than we’d predict assuming
normality.
Also, there is a large negative deviation in next to the middle of the distribution.
My conclusion is that roots of AtBat levels are not quite Gaussian distributed.
Hitsbins <- seq(min(baseball$Hits, na.rm = TRUE), max(baseball$Hits, na.rm = TRUE), length.out = 20)
bin.mids <- (bins[-1] + bins[-length(bins)]) / 2
with(baseball,
hist(Hits, breaks = bins, xlab = "Height", main = ""))
a = fivenum(baseball$Hits)
b = lval(baseball$Hits)
FL = a[2] # lower fourth
FU = a[4] # upper fourth
m = (FL + FU) / 2 # m = b[2,4]
cat("The matching mean: m=", m)
## The matching mean: m= 106
st = (FU - FL) / 1.349 # s = the Gaussian standard deviation
cat("The Gaussian standard deviation: st =", st)
## The Gaussian standard deviation: st = 51.9
# So our matching Gaussian curve is N(m, st).
s <- fit.gaussian(baseball$Hits, bins, m, st)
options(digits=3)
df = data.frame(Mid=bin.mids, d=s$counts, sqrt.d=sqrt(s$counts),
Prob=s$probs, e=s$expected, sqrt.e=sqrt(s$expected),
Residual=s$residual, DDR = ((sqrt(2+4*s$counts) - sqrt(1+4*s$expected))))
h <- with(baseball,
hist(Hits, breaks = bins, xlab = "Hits", main = ""))
data.frame(Mid=h$mids, Count=h$counts, Roots=sqrt(h$counts))
## Mid Count Roots
## 1 7.24 2 1.41
## 2 19.71 0 0.00
## 3 32.18 4 2.00
## 4 44.66 18 4.24
## 5 57.13 25 5.00
## 6 69.61 23 4.80
## 7 82.08 29 5.39
## 8 94.55 19 4.36
## 9 107.03 26 5.10
## 10 119.50 19 4.36
## 11 131.97 25 5.00
## 12 144.45 23 4.80
## 13 156.92 21 4.58
## 14 169.39 14 3.74
## 15 181.87 6 2.45
## 16 194.34 3 1.73
## 17 206.82 4 2.00
## 18 219.29 1 1.00
## 19 231.76 1 1.00
h$counts <- sqrt(h$counts)
plot(h, xlab = "Hits", ylab = "ROOT FREQUENCY", main = "")
lines(bin.mids, sqrt(s$expected))
h <- with(baseball,
hist(Hits, breaks = bins, xlab = "Hits", main = ""))
data.frame(Mid=h$mids, Count=h$counts, Roots=sqrt(h$counts))
## Mid Count Roots
## 1 7.24 2 1.41
## 2 19.71 0 0.00
## 3 32.18 4 2.00
## 4 44.66 18 4.24
## 5 57.13 25 5.00
## 6 69.61 23 4.80
## 7 82.08 29 5.39
## 8 94.55 19 4.36
## 9 107.03 26 5.10
## 10 119.50 19 4.36
## 11 131.97 25 5.00
## 12 144.45 23 4.80
## 13 156.92 21 4.58
## 14 169.39 14 3.74
## 15 181.87 6 2.45
## 16 194.34 3 1.73
## 17 206.82 4 2.00
## 18 219.29 1 1.00
## 19 231.76 1 1.00
h$counts <- sqrt(h$counts)
plot(h, xlab = "TIME", ylab = "ROOT FREQUENCY", main = "")
rootogram(s$counts, s$expected)
rootogram(s$counts, s$expected, type="deviation")
According to the hang plot: